library(SummarizedExperiment)
library(SingleCellExperiment)
library(GEOquery)
library(ISLET)
library(Seurat)
library(MuSiC)
library(dplyr)
library(ggplot2)
library(tidyr)
library(fgsea)
library(Matrix)
library(pheatmap)
library(stringr)
library(gridExtra)
library(reshape2)
library(biomaRt)
library(FARDEEP)
library(linseed)To examine molecular changes throughout disease progression, brain tissue from male and female, 3xTg-AD and B6129 control mice, between the ages of 6- and 24-months-old were collected.
Time points: 6 Months - 9 Months - 12 Months - 15 Months - 18 Months - 21 Months - 24 Months
rm(list=ls())
setwd("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis")
#library(GEOquery)
data <- getGEO("GSE254970")
gset <- data[[1]]
exprs_data <- read.csv("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/data/Alzheimer mice/GSE254970_processed_data.csv", sep=",", header = TRUE)
pdata <- pData(gset)
#colnames(exprs_data) = pdata$geo_accessionIn the raw data there are 24080 genes with a few duplicates. We deleted the duplicated ones here. Now there are 24022 genes left.
## Number of genes after deleting replicated ones: 24022
We conduct quality control (QC) to our data. Notably, the quality control of the scRNA-seq for reference profile is the same as here.
# Keep only "detectable" genes: at least 5% of cells (regardless of the group) have a read/UMI count different from 0
keep <- which(Matrix::rowSums(exprs_data > 0) >= round(0.05 * ncol(exprs_data)))
exprs_data = exprs_data[keep,]
cat("Number of genes after QC:", nrow(exprs_data))## Number of genes after QC: 19619
We used the mouse brain single cell RNA sequencing data from
GSE129788 to construct the single cell signature gene
matrix, similar to Ren’s work (2023). Specifically, we choose the
GSM3722100 and GSM3722108 to construct
reference profile.
Ren’s work: https://www.nature.com/articles/s41598-023-44183-7#Sec15
Firstly we use Seurat to conduct clustering analysis on
GSM3722100 and GSM3722108. According to the
supplementary file of GSE129788 from GEO, there are 6 types
of cells in this scRNA-seq, they are: IMMUNE_Lin (Immune Lineage),
OLG_Lin (Oligodendrocyte Lineage), Neuron Lineage (NEURON_Lin), Vascular
Lineage (VASC_Lin), Astrocyte Lineage (ASC_Lin) and Ependymal Cell
Lineage (EPC_Lin).
m1 <- read.table("//Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/data/Alzheimer mice/GSM3722100_YX1L_10X.txt", sep = "\t")
m2 <- read.table("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/data/Alzheimer mice/GSM3722108_OX1X_10X.txt", sep = "\t")
##################### QC #####################
# First: cells with library size, mitochondrial or ribosomal content further than three MAD away were discarded
filterCells <- function(filterParam){
cellsToRemove <- which(filterParam > median(filterParam) + 3 * mad(filterParam) | filterParam < median(filterParam) - 3 * mad(filterParam) )
cellsToRemove
}
sc_data <- do.call(cbind, list(m1, m2))
libSizes <- colSums(sc_data)
gene_names <- rownames(sc_data)
mtID <- grepl("^MT-|_MT-", gene_names, ignore.case = TRUE)
rbID <- grepl("^RPL|^RPS|_RPL|_RPS", gene_names, ignore.case = TRUE)
mtPercent <- colSums(sc_data[mtID, ])/libSizes
rbPercent <- colSums(sc_data[rbID, ])/libSizes
lapply(list(libSizes = libSizes, mtPercent = mtPercent, rbPercent = rbPercent), filterCells) %>%
unlist() %>%
unique() -> cellsToRemove
if(length(cellsToRemove) != 0){
sc_data <- sc_data[,-cellsToRemove]
}
# Keep only "detectable" genes: at least 5% of cells (regardless of the group) have a read/UMI count different from 0
keep <- which(Matrix::rowSums(sc_data > 0) >= round(0.05 * ncol(sc_data)))
sc_data = sc_data[keep,]
##################### Clustering #####################
sc_seurat_obj <- CreateSeuratObject(counts = sc_data)
sc_seurat_obj <- NormalizeData(sc_seurat_obj)
sc_seurat_obj <- FindVariableFeatures(sc_seurat_obj, nfeatures = 2000)
sc_seurat_obj <- ScaleData(sc_seurat_obj)
# PCA
sc_seurat_obj <- RunPCA(sc_seurat_obj, verbose=FALSE)
VizDimLoadings(sc_seurat_obj, dims = 1:2, reduction = "pca")# Clustering
sc_seurat_obj <- FindNeighbors(sc_seurat_obj, dims = 1:10)
sc_seurat_obj <- FindClusters(sc_seurat_obj, resolution = 0.025)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 2931
## Number of edges: 97041
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9937
## Number of communities: 6
## Elapsed time: 0 seconds
# find marker genes of each cluster
cluster_markers <- FindAllMarkers(sc_seurat_obj, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25, verbose = FALSE)
# choose top marker
top_markers <- cluster_markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC)
top_genes <- top_markers %>% group_by(cluster) %>% top_n(n = 5, wt = avg_log2FC)
#FeaturePlot(sc_seurat_obj, features = unique(top_genes$gene), cols = c("blue", "red"), reduction = "tsne")
new_cluster_ids <- c("CellType1", "CellType2", "CellType3", "CellType4", "CellType5", "CellType6")
names(new_cluster_ids) <- levels(sc_seurat_obj)
sc_seurat_obj <- RenameIdents(sc_seurat_obj, new_cluster_ids)
# bulk signature matrix
average_expression <- AverageExpression(sc_seurat_obj)
bulk_signature_matrix <- average_expression$RNA
head(bulk_signature_matrix)## 6 x 6 sparse Matrix of class "dgCMatrix"
## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## Sox17 0.007098499 0.04946768 0.008797571 5.87056005 0.03519064 .
## Mrpl15 0.785016347 1.27775229 0.850610239 1.00260087 1.16316468 0.9736278
## Lypla1 0.511008341 0.79224327 0.532664894 0.99712555 0.98537447 0.5917463
## Tcea1 1.295726505 1.04054874 1.003208501 1.32072542 0.92234978 1.2661495
## Rgs20 0.051042227 2.13839720 0.434447441 0.07072141 . 0.5036883
## Atp6v1h 0.840131906 0.91241426 1.490793849 0.62345006 1.01080326 1.0870432
write.csv(bulk_signature_matrix, "/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/data/Alzheimer mice/signature_matrix bulk.csv")
# sc signature matrix
cell_ids <- colnames(sc_seurat_obj)
cluster_ids <- sc_seurat_obj$seurat_clusters
cell_types <- sc_seurat_obj@active.ident
pheno_sc_data <- data.frame(
CellID = cell_ids,
ClusterID = cluster_ids,
CellType = cell_types,
stringsAsFactors = FALSE
)
pheno_sc_data$SubjectName <- ifelse(pheno_sc_data$CellID %in% colnames(m1), "m1",
ifelse(pheno_sc_data$CellID %in% colnames(m2), "m2", NA))
sc_seurat_obj$SubjectName <- pheno_sc_data$SubjectName
sc_seurat_obj$CellType <- pheno_sc_data$CellType
# Visualization
sc_seurat_obj <- RunTSNE(sc_seurat_obj, dims = 1:10)
DimPlot(sc_seurat_obj, reduction = "tsne", group.by = c("CellType", "SubjectName")) # find variable feature (top 2,000)
markers <- VariableFeatures(sc_seurat_obj)
marker_distrib <- data.frame(genes = character(length(VariableFeatures(sc_seurat_obj))))
marker_distrib$genes <- VariableFeatures(sc_seurat_obj)
variable_gene_plot <- VariableFeaturePlot(sc_seurat_obj)
variable_gene_plot_with_label <- LabelPoints(plot= variable_gene_plot,
points = head(VariableFeatures(sc_seurat_obj), 10),
repel=TRUE, xnudge = 0, ynudge = 0)
variable_gene_plot_with_labelHere we apply TOAST for cell type deconvolution.
C = sc_data
T = exprs_data
pDataC = pheno_sc_data
########## MATRIX DIMENSION APPROPRIATENESS ##########
C_bulk = data.frame(bulk_signature_matrix)
keep = intersect(rownames(C_bulk),rownames(T))
C_bulk = C_bulk[keep,]
T_bulk = T[keep,]
C.eset <- Biobase::ExpressionSet(assayData = as.matrix(C),phenoData = Biobase::AnnotatedDataFrame(pDataC))
T.eset <- Biobase::ExpressionSet(assayData = as.matrix(T))
C.eset <- SingleCellExperiment(assays = list(counts = as.matrix(C)))
colData(C.eset) <- DataFrame(assigned_cluster = pDataC$CellType, SubjectName = rownames(pDataC))
#FARDEEP_RESULTS = t(FARDEEP::fardeep(C_bulk, T_bulk, nn = TRUE, intercept = TRUE, permn = 10, QN = FALSE)$abs.beta)
#FARDEEP_RESULTS = apply(FARDEEP_RESULTS,2,function(x) x/sum(x))
#head(FARDEEP_RESULTS[, 1:5])lo <- LinseedObject$new(T_bulk, samples=1:20, topGenes=10000)
lo$calculatePairwiseLinearity()
lo$calculateSpearmanCorrelation()
lo$calculateSignificanceLevel(100)
lo$significancePlot(0.01)
lo$filterDatasetByPval(0.01)
lo$svdPlot()
lo$setCellTypeNumber(6)
lo$project("full") # projecting full dataset
lo$projectionPlot(color="filtered")
lo$project("filtered")
lo$smartSearchCorners(dataset="filtered", error="norm")
lo$deconvolveByEndpoints()
rownames(lo$proportions) <- c("CellType1", "CellType2", "CellType3", "CellType4", "CellType5", "CellType6")
linseed_RESULTS <- data.frame(lo$proportions)
linseed_RESULTS#### TOAST
CellType <- list(CellType1 = 1,
CellType2 = 2,
CellType3 = 3,
CellType4 = 4,
CellType5 = 5,
CellType6 = 6)
myMarker <- ChooseMarker(C_bulk,
CellType,
nMarkCT = 20,
chooseSig = TRUE,
verbose = FALSE)
lapply(myMarker, head, 10)## $CellType1
## [1] "Cldn11" "Ermn" "Mog" "Opalin" "Ppp1r14a" "Mag"
## [7] "Hapln2" "Aspa" "Gjb1" "Slain1"
##
## $CellType2
## [1] "Gja1" "Gjb6" "Aqp4" "Acsbg1" "Fxyd1" "Slc6a11" "Slc7a10"
## [8] "Fgfr3" "Sox9" "Ntsr2"
##
## $CellType3
## [1] "Syt1" "Synpr" "Gad1" "Ndrg4" "Scg2" "Camk2b" "Grin2b" "Atp1a3"
## [9] "Snap25" "Gad2"
##
## $CellType4
## [1] "Ly6c1" "Ly6a" "Cldn5" "Flt1" "Pglyrp1" "Slco1a4" "Itm2a"
## [8] "Egfl7" "Cxcl12" "Hspb1"
##
## $CellType5
## [1] "C1qa" "C1qc" "Ctss" "C1qb" "Tyrobp" "Trem2" "Fcer1g" "Selplg"
## [9] "P2ry12" "Csf1r"
##
## $CellType6
## [1] "C1ql1" "Tnr" "Matn4" "Pcdh15" "Cacng4" "Cdo1" "Nxph1" "Xylt1"
## [9] "Sulf2" "Qpct"
## Deconvolution without prior information.
## B2_1_9F_WT B2_4_9M_WT B2_5_9M_WT B2_6_9M_WT B2_7_12F_WT
## CellType1 0.04880276 0.05878701 0.05329381 0.05442960 0.05154282
## CellType2 0.21346522 0.18212163 0.16247424 0.22798053 0.15553187
## CellType3 0.24898420 0.27395736 0.28107048 0.25143251 0.28428275
## CellType4 0.18593377 0.17330934 0.15781129 0.18301213 0.17131756
## CellType5 0.03393061 0.04112509 0.03786826 0.03209697 0.04273051
## CellType6 0.26888345 0.27069958 0.30748192 0.25104827 0.29459449
An overview of our
proposed method ISLET (Individual Specific celL typE referencing Tool).
A ISLET takes repeatedly measured bulk RNA-seq data,
cell type proportions (known or estimated), and disease status as the
algorithm input. Additional covariates are optional. B
By a hierarchical mixed-effect modeling, ISLET can iteratively retrieve
individual-specific and cell-type-specific gene expression reference
panels. The fixed effect is the group-level average and the random
effect is the individual-level deviance from the group mean.
C Given the individual-specific reference panel, ISLET
can conduct test to identify cell-type-specific differentially expressed
genes (csDEG)
ISLET is formulated as a mixed-effect regression model: \[y = X\beta + Au + \epsilon\]
ISLET needs one input file organized into
SummarizedExperiment objects, combining cases and controls.
So we first convert data into SummarizedExperiment objects.
The original data from GEO doesn’t specify subject ID, so here we treat sample with the same sex, genotype and the same biological repeat number as the same subject. We manually name subject by sex(1 for female 2 for male)-genotype(0 for ctrl 1 for case)-bio rep. For example, a female mouse whose genotype is B6129 with bio rep #1 will be named 101.
######### colData: sample meta-data & input cell type proportions #######
### proportion
proportion <- data.frame(t(estProp_PRF))
rownames(proportion) <- pdata$geo_accession
### meta-data
pheno_bulk_data <- read.csv("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/data/Alzheimer mice/sample_pheno_data.csv", sep=",", row.names = 1)
sample_info <- data.frame(group = ifelse(pdata$`genotype:ch1` == "B6;129S1/SvImJ", "ctrl",
ifelse(pdata$`genotype:ch1` == "3xTg-AD", "case", NA)),
subject_ID = pheno_bulk_data$subject_ID,
CellType1 = proportion$CellType1,
CellType2 = proportion$CellType2,
CellType3 = proportion$CellType3,
CellType4 = proportion$CellType4,
CellType5 = proportion$CellType5,
CellType6 = proportion$CellType6
)
sample_info_age <- data.frame(group = ifelse(pdata$`genotype:ch1` == "B6;129S1/SvImJ", "ctrl",
ifelse(pdata$`genotype:ch1` == "3xTg-AD", "case", NA)),
subject_ID = pheno_bulk_data$subject_ID,
age = as.numeric(pdata$`age (months):ch1`),
CellType1 = proportion$CellType1,
CellType2 = proportion$CellType2,
CellType3 = proportion$CellType3,
CellType4 = proportion$CellType4,
CellType5 = proportion$CellType5,
CellType6 = proportion$CellType6
)
sample_info_age_sex <- data.frame(group = ifelse(pdata$`genotype:ch1` == "B6;129S1/SvImJ", "ctrl",
ifelse(pdata$`genotype:ch1` == "3xTg-AD", "case", NA)),
subject_ID = pheno_bulk_data$subject_ID,
age = as.numeric(pdata$`age (months):ch1`),
sex = ifelse(pdata$`Sex:ch1` == "Female", 0,
ifelse(pdata$`Sex:ch1` == "Male", 1, NA)),
# 0 for female, 1 for male
CellType1 = proportion$CellType1,
CellType2 = proportion$CellType2,
CellType3 = proportion$CellType3,
CellType4 = proportion$CellType4,
CellType5 = proportion$CellType5,
CellType6 = proportion$CellType6
)
rownames(sample_info) = pdata$geo_accession
rownames(sample_info_age) = pdata$geo_accession
rownames(sample_info_age_sex) = pdata$geo_accession
sample_info <- sample_info %>% arrange(subject_ID) %>% arrange(group)
sample_info_age <- sample_info_age %>% arrange(subject_ID) %>% arrange(group)
sample_info_age_sex <- sample_info_age_sex %>% arrange(subject_ID) %>% arrange(group)
######### counts: gene expression value data frame #########
counts = T_bulk
colnames(counts) = pdata$geo_accession
sorted_row_names <- rownames(sample_info)
counts <- data.frame(counts[ , sorted_row_names])mice_se <- SummarizedExperiment(
assays = list(counts = counts),
colData = sample_info)
# unique(colData(mice_se)$group)[1]
mice_se_age <- SummarizedExperiment(
assays = list(counts = counts),
colData = sample_info_age)
mice_se_age_sex <- SummarizedExperiment(
assays = list(counts = counts),
colData = sample_info_age_sex)## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## GSM8061519 GSM8061525 GSM8061540 GSM8061550 GSM8061557 GSM8061559
## Sox17 47 184 82 41 59 46
## Mrpl15 374 1128 513 439 280 343
## Lypla1 364 912 562 444 308 357
## GSM8061509 GSM8061513 GSM8061528 GSM8061534 GSM8061542 GSM8061575
## Sox17 16 85 72 105 124 77
## Mrpl15 95 417 382 677 919 345
## Lypla1 139 388 446 792 890 361
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "CellType1" "CellType2" "CellType3" "CellType4" "CellType5" "CellType6"
## Total sample number and subject number:
## [1] 77 17
## Total case number and ctrl number:
## [1] 11 6
## First several subject ID for the samples:
## [1] 111 111 111 111 111 111 111 112 112 112
## Data preparation type (intercept/slope):
## [1] "intercept"
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
## First couple of elements from cases and controls:
## GSM8061519 GSM8061525 GSM8061540 GSM8061550 GSM8061557 GSM8061559
## Sox17 47 184 82 41 59 46
## Mrpl15 374 1128 513 439 280 343
## Lypla1 364 912 562 444 308 357
## GSM8061509 GSM8061513 GSM8061528 GSM8061534 GSM8061542 GSM8061575
## Sox17 16 85 72 105 124 77
## Mrpl15 95 417 382 677 919 345
## Lypla1 139 388 446 792 890 361
## Design matrices hidded.
## Total cell type number:
## [1] 6
## Cell type categories:
## [1] "CellType1" "CellType2" "CellType3" "CellType4" "CellType5" "CellType6"
## Total sample number and subject number:
## [1] 77 17
## Total case number and ctrl number:
## [1] 11 6
## First several subject ID for the samples:
## [1] 111 111 111 111 111 111 111 112 112 112
## Data preparation type (intercept/slope):
## [1] "slope"
With the curated data input1 from the previous step, now
we can use ISLET to conduct deconvolution and obtain the
individual-specific and cell-type-specific reference panels. This
process can be achieved by running:
The total running time of isletSolve here is 6 min.
The res.sol is the deconvolution result list. For both
case and control group, the deconvolution result is a list of length K,
where K is the number of cell types. For each of the K elements, it is a
matrix of dimension G by N. For each of the K cell types, it stores the
deconvoluted values in a feature (G) by subject (N) matrix.
#View the deconvolution results
caseVal <- caseEst(res.sol)
ctrlVal <- ctrlEst(res.sol)
#length(caseVal) #For cases, a list of 11 cell types' matrices.
#length(ctrlVal) #For controls, a list of 6 cell types' matrices.
################## CellType 1 ##################
## view the reference panels for CellType1,
## for the first 5 genes and first 4 subjects, in Case group.
caseVal$CellType1[1:5, 1:4]## 111 112 113 114
## Sox17 1969.486 1964.361 1971.448 1968.207
## Mrpl15 8444.121 8417.198 8431.861 8422.655
## Lypla1 8299.860 8275.682 8294.749 8283.301
## Tcea1 13463.975 13435.548 13463.117 13447.811
## Rgs20 4583.012 4564.442 4568.838 4564.864
## view the reference panels for CellType1,
## for the first 5 genes and first 4 subjects, in Control group.
ctrlVal$CellType1[1:5, 1:4]## 101 102 103 201
## Sox17 225.2437 223.4167 223.3420 224.1109
## Mrpl15 0.0000 0.0000 0.0000 0.0000
## Lypla1 741.8222 730.1537 730.6757 732.3033
## Tcea1 2745.2806 2738.0202 2741.4625 2745.0388
## Rgs20 0.0000 0.0000 0.0000 0.0000
## 111 112 113 114
## Sox17 0 0 0 0
## Mrpl15 0 0 0 0
## Lypla1 0 0 0 0
## Tcea1 0 0 0 0
## Rgs20 0 0 0 0
## 101 102 103 201
## Sox17 58.793 57.60225 56.33527 58.94634
## Mrpl15 1755.765 1754.87809 1760.10238 1755.05910
## Lypla1 1540.247 1538.64287 1527.29884 1541.41561
## Tcea1 1846.738 1851.40537 1840.94804 1862.40526
## Rgs20 0.000 0.00000 0.00000 0.00000
## 111 112 113 114
## Sox17 0 0 0 0
## Mrpl15 0 0 0 0
## Lypla1 0 0 0 0
## Tcea1 0 0 0 0
## Rgs20 0 0 0 0
## 101 102 103 201
## Sox17 700.4201 697.3487 696.4531 697.7094
## Mrpl15 7713.6543 7701.3812 7709.4721 7697.8531
## Lypla1 6047.4454 6032.4728 6025.7197 6031.8001
## Tcea1 6563.3980 6555.2641 6551.9850 6562.2190
## Rgs20 4311.4950 4301.8949 4303.0512 4298.1654
## 111 112 113 114
## Sox17 712.1604 702.2575 712.8908 709.8051
## Mrpl15 1277.6368 1231.8410 1246.2056 1241.7989
## Lypla1 1398.6513 1355.9685 1379.1372 1370.1712
## Tcea1 1295.1349 1245.2202 1276.9278 1266.6607
## Rgs20 516.9059 484.9252 488.3656 487.1346
## 101 102 103 201
## Sox17 64.53541 62.95829 61.98953 64.62263
## Mrpl15 0.00000 0.00000 0.00000 0.00000
## Lypla1 0.00000 0.00000 0.00000 0.00000
## Tcea1 0.00000 0.00000 0.00000 0.00000
## Rgs20 0.00000 0.00000 0.00000 0.00000
## 111 112 113 114
## Sox17 729.1094 725.3889 728.1152 729.0995
## Mrpl15 2909.2057 2885.5681 2878.7927 2889.1999
## Lypla1 3429.0958 3407.7300 3409.7714 3414.9407
## Tcea1 3099.6281 3077.9872 3076.3328 3087.0519
## Rgs20 3973.0491 3954.0033 3946.7686 3954.5741
## 101 102 103 201
## Sox17 793.1356 791.2056 790.772 791.8918
## Mrpl15 5002.1085 4993.8656 4998.218 4994.6433
## Lypla1 3299.2689 3289.9361 3288.066 3292.4265
## Tcea1 6357.1076 6351.2111 6349.700 6356.6244
## Rgs20 6084.9127 6079.0188 6079.659 6078.8586
## 111 112 113 114
## Sox17 142.1031 133.6268 141.0296 139.4276
## Mrpl15 1563.3951 1520.8680 1525.7712 1527.3008
## Lypla1 1250.1371 1210.7999 1223.0952 1220.4631
## Tcea1 696.4955 651.2932 669.6049 667.8423
## Rgs20 1970.1652 1938.3513 1936.2548 1936.6064
## 101 102 103 201
## Sox17 0 0 0 0
## Mrpl15 0 0 0 0
## Lypla1 0 0 0 0
## Tcea1 0 0 0 0
## Rgs20 0 0 0 0
Now we can test the group effect on individual reference panels, i.e., identifying csDE genes in mean or intercept. In this ‘intercept test’, we assume that the individual-specific reference panel is unchanged across time points. Note that the deconvolution in section 3.3 can be skipped, if one only need to call csDE genes.
The result res.test is a matrix of p-values, in the
dimension of feature by cell type. Each element is the LRT p-value, by
contrasting case group and control group, for one feature in one cell
type.
Total running time of isletTest: 40 min
## csDE testing on cell type 1
## csDE testing on cell type 2
## csDE testing on cell type 3
## csDE testing on cell type 4
## csDE testing on cell type 5
## csDE testing on cell type 6
## csDE testing on 6 cell types finished
## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## Sox17 0.2786160 0.13829726 0.18340156 0.157439516 0.9922692 0.29676901
## Mrpl15 0.1634493 0.08624978 0.02911245 0.021594049 0.7718148 0.06871475
## Lypla1 0.2725019 0.08153007 0.04910294 0.020393638 0.9937811 0.12371825
## Tcea1 0.1915976 0.03202852 0.10553086 0.006248477 0.6979749 0.23273610
## Rgs20 0.3587568 0.21220792 0.09572106 0.060230418 0.7142026 0.16298354
## Atp6v1h 0.1567014 0.03734762 0.04962022 0.008230408 0.7419496 0.11884738
Given an additional continuous variable such as time or age, ISLET is able to compare cases and controls in the change-rate of reference profile over time. This is the ‘slope test’. Here, the assumption is that for the participants or subjects in a group, the individual reference profile could change over time, with change-rate fixed by group. At a given time point, there may be no (significant) group effect in the reference panel, but the participants still have distinct underlying reference profiles. Under this setting, it is of interest to test for such difference. Below is an example to detect reference panel change-rate difference between two groups, from data preparation to test.
The result age.test is a matrix of p-values, in the
dimension of feature by cell type. Each element is the LRT p-value, by
contrasting case group and control group, for one feature in one cell
type. In contrast to the (intercept) test described before, here is a
test for difference of the expression CHANGE IN REFERENCE over time,
between cases and controls.
Total running time of isletTest: 32 min
## csDE testing on cell type 1
## csDE testing on cell type 2
## csDE testing on cell type 3
## csDE testing on cell type 4
## csDE testing on cell type 5
## csDE testing on cell type 6
## csDE testing on 6 cell types finished
## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## Sox17 0.3232656 0.2013691 0.09575610 0.9276896 0.8419106 0.024398700
## Mrpl15 0.5108663 0.5886100 0.06518907 0.9875777 0.9438597 0.008012505
## Lypla1 0.7375437 0.4241941 0.01835446 0.9094557 0.7560827 0.002035854
## Tcea1 0.8309337 0.3584805 0.03339309 0.8128147 0.5640995 0.002960976
## Rgs20 0.3324975 0.3894123 0.09234321 0.7406966 0.9108752 0.008082987
## Atp6v1h 0.5305558 0.3755488 0.07215422 0.8431109 1.0000000 0.012299150
Total running time of imply: 3 min
## Begin: working on data preparation as the input for imply.
## Complete: data preparation for imply.
## Personalized Panel recovered by lmer.
## Personalized deconvolution done!
#View the subject-specific and cell-type-specific reference panels solved
#by linear mixed-effect models of the first subject
head(result$p.ref[,,1])## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## Sox17 2070.035 0 0 695.1377 759.1296 184.3221
## Mrpl15 8557.698 0 0 1237.1534 2813.6616 1768.4115
## Lypla1 8556.239 0 0 1318.7784 3408.1019 1444.6482
## Tcea1 13827.229 0 0 1169.3569 3118.0449 911.9497
## Rgs20 4821.912 0 0 418.3658 3846.9333 2205.5175
## Atp6v1h 43780.636 0 0 5022.7372 17093.4546 4734.1681
## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## GSM8061519 0.08976380 0.2174894 0 0.04588418 0.143145724 0.5037169
## GSM8061525 0.09598908 0.4979805 0 0.07178685 0.074878052 0.2593655
## GSM8061540 0.07953159 0.6146913 0 0.05348018 0.016383142 0.2359138
## GSM8061550 0.13505085 0.1095188 0 0.03142922 0.251222651 0.4727785
## GSM8061557 0.15427456 0.2129247 0 0.20850562 0.000000000 0.4242951
## GSM8061559 0.12446574 0.6631387 0 0.11008206 0.002013576 0.1002999
## CellType1 CellType2 CellType3 CellType4 CellType5 CellType6
## GSM8061512 0.00000000 0.07392858 0.05830834 0.8677631 0.00000000 0.00000000
## GSM8061518 0.00000000 0.06755273 0.05154490 0.7130595 0.00000000 0.16784290
## GSM8061533 0.00000000 0.14594556 0.10225932 0.3182801 0.00000000 0.43351499
## GSM8061539 0.00000000 0.00000000 0.10470882 0.5209324 0.00000000 0.37435879
## GSM8061547 0.01033766 0.12493806 0.09992012 0.4993370 0.01191355 0.25355361
## GSM8061572 0.00000000 0.10005417 0.05468368 0.7655992 0.00000000 0.07966296
toast_prop <- proportion[, -1]
islet_prop <- result$imply.prop
common_rownames <- intersect(rownames(toast_prop), rownames(islet_prop))
toast_prop <- toast_prop[common_rownames, ]
islet_prop <- islet_prop[common_rownames, ]
heatmap_toast <- pheatmap(toast_prop, main = "TOAST Proportions", cluster_rows = F, cluster_cols = F,
show_rownames = F)heatmap_islet <- pheatmap(islet_prop, main = "ISLET Proportions", cluster_rows = F, cluster_cols = F,
show_rownames = F)We use package fgsea to perform Gene Sets Enrichment
Analysis (GSEA) across 45 candidate REACTOME pathways with size of at
least 20 genes, using the rank of test statistics in each method.
Here we are interested in the top 20 variable genes in the bulk sample.
gene_variability <- apply(counts, 1, sd)
variability_df <- data.frame(gene = rownames(counts), variability = gene_variability)
# sort according to variablity
variability_df <- variability_df[order(variability_df$variability, decreasing = TRUE), ]
top20_genes <- head(variability_df$gene, 20)
custom_gene_set <- list(
"top20_variable_genes" = top20_genes
)
top20_counts <- counts[top20_genes, ]
pheatmap(
top20_counts,
cluster_rows = TRUE, # 是否对基因进行聚类
cluster_cols = F, # 是否对样本进行聚类
scale = "row", # 对行进行标准化
color = colorRampPalette(c("blue", "white", "red"))(50), # 热图颜色
main = "Heatmap of the Count of Top 20 Variable Genes"
)ISLET using TOAST
as inputWe convert the \(\mathrm{p-value}\) acquired in the above section 3.5 into -log10 scale since the small p-values are essential in csDEG calling. The more significant the gene is, the bigger \(-log_{10}(\mathrm{p-value})\) is. We visualize it by a heatmap.
#significant_gene_matrix <- age.test[rownames(age.test) %in% unlist(csDEG_list), ]
pheatmap(-log10(age.test),
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("blue", "white", "red"))(50),
show_rownames = F,
main = "Heatmap of -log10(p-value) for csDEG")We used BH procedure to control FDR in multiple testing, and then
reported csDEG at FDR < 0.25. (In the ISLET paper, the
author reported csDEG at FDR < 0.1, but here all the FDR we get is
above 0.2, so we report csDEG at FDR < 0.25)
Given that there are so many genes reported, we only explore the top 50 genes with least p-value, meaning that they are the most significant. We want to be stringent with the csDEGs, so we only keep csDEG that appear in only one cell type.
p_values_long <- as.data.frame(as.table(age.test))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")
csDEG <- p_values_long %>%
filter(FDR < 0.25)
top_csDEG <- csDEG %>%
group_by(CellType) %>%
arrange(CellType, PValue) %>%
slice_head(n = 50) %>%
ungroup()
gene_list_by_celltype <- top_csDEG %>%
split(.$CellType) %>%
lapply(function(df) df$Gene)
#gene_list_by_celltype# 计算 -log(p-value)
top_csDEG <- top_csDEG %>%
mutate(NegLogPValue = -log10(PValue))
# 准备绘图数据:宽格式
heatmap_data <- top_csDEG %>%
dplyr::select(CellType, Gene, NegLogPValue) %>%
pivot_wider(names_from = CellType, values_from = NegLogPValue)# 计算每个基因在不同细胞类型中的出现次数
gene_celltype_count <- heatmap_data %>%
pivot_longer(-Gene, names_to = "CellType", values_to = "NegLogPValue") %>%
filter(!is.na(NegLogPValue)) %>% # 只考虑非 NA 值
group_by(Gene) %>%
summarise(CellTypeCount = n_distinct(CellType)) %>%
filter(CellTypeCount == 1) # 筛选仅在一种细胞类型中出现的基因
# 将这些基因与原数据进行合并,找出这些基因所在的细胞类型
unique_genes <- gene_celltype_count$Gene
unique_genes_data <- heatmap_data %>%
pivot_longer(-Gene, names_to = "CellType", values_to = "NegLogPValue") %>%
filter(Gene %in% unique_genes, !is.na(NegLogPValue)) %>%
dplyr::select(CellType, Gene)
# 创建一个按细胞类型整理的命名列表,每个元素是一个基因向量
gene_list_by_celltype <- unique_genes_data %>%
group_by(CellType) %>%
summarise(Genes = list(Gene), .groups = 'drop')
# 将结果转换为命名列表
gene_list_by_celltype_named_list <- setNames(
lapply(gene_list_by_celltype$Genes, function(x) x),
gene_list_by_celltype$CellType
)
# 打印结果
print(gene_list_by_celltype_named_list)## $CellType1
## [1] Ly6g6f
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
##
## $CellType2
## [1] Rtp4 Sult1a1
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
##
## $CellType3
## [1] Rnaset2b Adipor2 Sertad1 Cpm Dusp6 Lifr Banp Fkbp5
## [9] Irs2 Kcnj10 Cox7a2l Hspb8 Ccdc124 Ppp2r3c Plekhf1 Gpt2
## [17] Tsc22d3 Errfi1 Dbndd2 Egr1 Zbtb16 Abca8a
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
##
## $CellType4
## [1] Ddx3y Sdf2l1 Fam177a
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
##
## $CellType5
## [1] Ctla2a
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
##
## $CellType6
## [1] Slc40a1 Plpp6 Gt(ROSA)26Sor Abhd6 Ndufaf4
## [6] Bloc1s6 Pmepa1 Rps19bp1 Spred1 Pcnp
## [11] Rpl23a Penk mt-Nd5 Kcnk1 mt-Co1
## [16] Gss Rgs4 Lgals1 Pdp1 Lypla1
## [21] Apln Kdm7a Borcs8
## 7215 Levels: Sox17 Mrpl15 Lypla1 Tcea1 Rgs20 Atp6v1h Rb1cc1 St18 Pcmtd1 ... Vamp7
The csDEG detected by ISLET for CellType1 is
“Ly6g6f”.
# 提取感兴趣的基因
genes_of_interest1_i <- c("Ly6g6f")
counts_subset1_i <- counts[genes_of_interest1_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long1_i <- as.data.frame(t(counts_subset1_i))
counts_long1_i$Sample_ID <- rownames(counts_long1_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined1_i <- merge(counts_long1_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long1_i <- data_combined1_i %>%
pivot_longer(cols = starts_with("Ly6g6f"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized1_i <- data_long1_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes1_i <- c("Ly6g6f")
# 绘图函数
plot_gene <- function(gene_name, data) {
ggplot(data %>% filter(gene == gene_name), aes(x = age, y = log2_count, color = group, group = group)) +
geom_line() +
geom_point() +
labs(title = paste("Gene:", gene_name),
x = "Age (Month)",
y = "Log2(Count)") +
theme_minimal()
}
# 绘制每个基因的图
plots1_i <- lapply(genes1_i, function(gene) {
plot_gene(gene, data_summarized1_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots1_i))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}There are papers suggesting that subject carrying CRN mutation (S+m+) will observe increase in “Ly6g6f” (https://www.sciencedirect.com/science/article/pii/S0197458012005933). Mutation in the progranulin gene (GRN) can cause frontotemporal dementia (FTD).
In regard to “Ly6g6f”, the higher the gene expression the greater was the atrophy in right superior frontal gyrus.
# 提取感兴趣的基因
genes_of_interest2_i <- c("Rtp4", "Sult1a1")
counts_subset2_i <- counts[genes_of_interest2_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long2_i <- as.data.frame(t(counts_subset2_i))
counts_long2_i$Sample_ID <- rownames(counts_long2_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined2_i <- merge(counts_long2_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long2_i <- data_combined2_i %>%
pivot_longer(cols = starts_with("Rtp4") | starts_with("Sult1a1"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized2_i <- data_long2_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes2_i <- c("Rtp4", "Sult1a1")
# 绘制每个基因的图
plots2_i <- lapply(genes2_i, function(gene) {
plot_gene(gene, data_summarized2_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots2_i, ncol = 2))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}“Rtp4” is a host gene (receptor transporter protein 4 [RTP4]) that can regulate host type I interferon responses and symptoms of experimental cerebral malaria. It’s a potential target for therapy in some neurological diseases. (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7431001/) “Rtp4” is also discovered as the Top 10 most significantly differentially expressed genes in APP-PS1 +antiCD8 mice compared to APP-PS1+PBS (https://www.sciencedirect.com/science/article/pii/S0889159119315739).
“suly1a1” (human sulfotransferase 1A1) has been reported as a new target for structure-based design of drugs to treat cancer according to https://www.cell.com/biophysj/pdf/S0006-3495(21)02756-9.pdf. It encodes an enzyme capable of transferring sulfate groups to a wide variety of endogenous and exogenous compounds, which makes them more water-soluble and easier to excrete. Currently there are NO paper available for pointing out a direct relation between this gene and Alzheimer.
# 提取感兴趣的基因
genes_of_interest3_i <- c("Rnaset2b", "Adipor2", "Sertad1", "Cpm", "Dusp6", "Lifr", "Banp",
"Fkbp5", "Irs2", "Kcnj10")
counts_subset3_i <- counts[genes_of_interest3_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long3_i <- as.data.frame(t(counts_subset3_i))
counts_long3_i$Sample_ID <- rownames(counts_long3_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined3_i <- merge(counts_long3_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long3_i <- data_combined3_i %>%
pivot_longer(cols = starts_with("Rnaset2b") | starts_with("Adipor2") | starts_with("Sertad1") | starts_with("Cpm") | starts_with("Dusp6") | starts_with("Lifr") | starts_with("Banp") | starts_with("Fkbp5") | starts_with("Irs2") | starts_with("Kcnj10"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized3_i <- data_long3_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes3_i <- c("Rnaset2b", "Adipor2", "Sertad1", "Cpm", "Dusp6", "Lifr", "Banp",
"Fkbp5", "Irs2", "Kcnj10")
# 绘制每个基因的图
plots3_i <- lapply(genes3_i, function(gene) {
plot_gene(gene, data_summarized3_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots3_i, ncol = 3))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}“Rnaset2b” is a orthologues of the single human RNASET2 gene, presenting on chromosome 17 in the mouse genome. In the CNS, adaptive immune cells and clonally expanded effector memory CD8+ T cells have come into focus in the pathogenesis of aging and Alzheimer’s disease. The observed neuroinflammatory response in Rnaset2−/− mice with considerable numbers of CD8+ T cells exceeds the bystander effects observed in many neurodegenerative disease models. (https://www.nature.com/articles/s41467-021-26880-x)
“Adipor2” (adiponectin receptor 2) is a receptor targeted by Adiponectin. Activation of AdipoR1 and AdipoR2 receptors can trigger various signaling pathways, such as the 5ʹadenosine monophosphate-activated protein kinase (AMPK) and Nuclear Factor-kappa B (NF-κB) pathways. The peripheral actions of adiponectin comprise the regulation of glucose and lipid metabolism, as well as the modulation of inflammation, which both play a substantial role in AD pathophysiology. (https://www.sciencedirect.com/science/article/pii/S1279770724002409)
# 提取感兴趣的基因
genes_of_interest4_i <- c("Ddx3y", "Sdf2l1", "Fam177a")
counts_subset4_i <- counts[genes_of_interest4_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long4_i <- as.data.frame(t(counts_subset4_i))
counts_long4_i$Sample_ID <- rownames(counts_long4_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined4_i <- merge(counts_long4_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long4_i <- data_combined4_i %>%
pivot_longer(cols = starts_with("Ddx3y") | starts_with("Sdf2l1") | starts_with("Fam177a"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized4_i <- data_long4_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes4_i <- c("Ddx3y", "Sdf2l1", "Fam177a")
# 绘制每个基因的图
plots4_i <- lapply(genes4_i, function(gene) {
plot_gene(gene, data_summarized4_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots4_i, ncol=2))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}“Ddx3y” is a gene located on the Y chromosome that encodes a member of the DEAD-box RNA helicase family that is active in male germ cells. It encodes a translation initiation factor thought to stabilize the binding of initiation Met-tRNA to the ribosome. A research suggests that the expression of “Ddx3y” will increase in AD patients. (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6981840/).
“Sdf2l1” is a protein coding gene. It’s reported to be significantly upregulated in enrichment analysis in AD patients. (https://www.sciencedirect.com/science/article/pii/S2666354621000302) (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9197405/)
# 提取感兴趣的基因
genes_of_interest5_i <- c("Ctla2a")
counts_subset5_i <- counts[genes_of_interest5_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long5_i <- as.data.frame(t(counts_subset5_i))
counts_long5_i$Sample_ID <- rownames(counts_long5_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined5_i <- merge(counts_long5_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long5_i <- data_combined5_i %>%
pivot_longer(cols = starts_with("Ctla2a"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized5_i <- data_long5_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes5_i <- c("Ctla2a")
# 绘制每个基因的图
plots5_i <- lapply(genes5_i, function(gene) {
plot_gene(gene, data_summarized5_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots5_i))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}In a research that demonstrated the induction of cognitive dysfunction induced by Sev in mice to corroborate the signaling pathway and the differentially expressed genes (DEGs), “Ctla2a” is significantly upregulated. Sevoflurane is a common anesthetic widely used in clinical practice, while Anesthetics could induce cognitive dysfunctions, such as Alzheimer’s disease in humans or mice. (https://pubmed.ncbi.nlm.nih.gov/31134678/)
# 提取感兴趣的基因
genes_of_interest6_i <- c("Slc40a1", "Plpp6", "Gt(ROSA)26Sor", "Abhd6", "Ndufaf4", "Bloc1s6",
"Pmepa1", "Rps19bp1", "Spred1", "Pcnp" )
counts_subset6_i <- counts[genes_of_interest6_i, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long6_i <- as.data.frame(t(counts_subset6_i))
counts_long6_i$Sample_ID <- rownames(counts_long6_i)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined6_i <- merge(counts_long6_i, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long6_i <- data_combined6_i %>%
pivot_longer(cols = starts_with("Slc40a1") | starts_with("Plpp6") | starts_with("Gt(ROSA)26Sor") | starts_with("Abhd6") | starts_with("Ndufaf4") | starts_with("Bloc1s6") | starts_with("Pmepa1") | starts_with("Rps19bp1") | starts_with("Spred1") | starts_with("Pcnp"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized6_i <- data_long6_i %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes6_i <- c("Slc40a1", "Plpp6", "Gt(ROSA)26Sor", "Abhd6", "Ndufaf4", "Bloc1s6",
"Pmepa1", "Rps19bp1", "Spred1", "Pcnp" )
# 绘制每个基因的图
plots6_i <- lapply(genes6_i, function(gene) {
plot_gene(gene, data_summarized6_i)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots6_i, ncol = 3))#for (i in 1:length(plots)) {
# ggsave(filename = paste0(genes[i], "_plot.png"), plot = plots[[i]], width = 8, height = 6)}“Slc40a1” is (Solute Carrier Family 40 Member 1) is a Protein Coding gene. It is the only mammalian nonheme cellular iron exporter identified to date. It transports iron from iron storage cells into the blood to optimize systemic iron homeostasis. Mice with global deletion of “Slc40a1” are embryonically lethal, suggesting the essential role of “Slc40a1” in development. In the central nervous system, “Slc40a1” is distributed in most cell types, including neurons, astrocytes, oligodendrocytes, and brain microvascular endothelial cells. It is also essential for mouse embryonic development, forebrain patterning and neural tube closure. Previous studies have shown that “Slc40a1” is likely downregulated in the brain tissues of AD patients and APPswe/PS1dE9 mice.(https://www.nature.com/articles/s41418-020-00685-9)
“Plpp6” (Phospholipid Phosphatase 6) is a Protein
Coding gene. It’s identified as MLR-associated ALS genes by a machine
learning tool RefMap (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8709890/).
Age-associated neurodegenerative diseases such as amyotrophic lateral
sclerosis (ALS), Parkinson’s disease (PD) and Alzheimer’s disease (AD)
are an unmet health need, with significant economic and societal
implications, and an ever-increasing prevalence. Membrane lipid rafts
(MLRs) are specialised plasma membrane microdomains that provide a
platform for intracellular trafficking and signal transduction,
particularly within neurons.
TOAST# 提取感兴趣的基因
genes_of_interest1 <- c("Cldn11","Ermn","Mog","Opalin","Ppp1r14a","Mag",
"Hapln2","Aspa","Gjb1","Slain1")
counts_subset1 <- counts[genes_of_interest1, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long1 <- as.data.frame(t(counts_subset1))
counts_long1$Sample_ID <- rownames(counts_long1)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined1 <- merge(counts_long1, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long1 <- data_combined1 %>%
pivot_longer(cols = starts_with("Cldn11") | starts_with("Ermn") | starts_with("Mog") | starts_with("Opalin") | starts_with("Ppp1r14a") | starts_with("Mag") | starts_with("Hapln2") | starts_with("Aspa") | starts_with("Gjb1") | starts_with("Slain1"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized1 <- data_long1 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes1 <- c("Cldn11","Ermn","Mog","Opalin","Ppp1r14a","Mag",
"Hapln2","Aspa","Gjb1","Slain1")
# 绘制每个基因的图
plots1 <- lapply(genes1, function(gene) {
plot_gene(gene, data_summarized1)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots1, ncol = 3))# 提取感兴趣的基因
genes_of_interest2 <- c("Gja1","Gjb6" ,"Aqp4","Acsbg1"," Fxyd1","Slc6a11","Slc7a10","Fgfr3", "Sox9", "Ntsr2" )
counts_subset2 <- counts[genes_of_interest2, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long2 <- as.data.frame(t(counts_subset2))
counts_long2$Sample_ID <- rownames(counts_long2)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined2 <- merge(counts_long2, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long2 <- data_combined2 %>%
pivot_longer(cols = starts_with("Gja1") | starts_with("Gjb6") | starts_with("Aqp4") | starts_with("Acsbg1") | starts_with("Fxyd1") | starts_with("Slc6a11") | starts_with("Slc7a10") | starts_with("Fgfr3") | starts_with("Sox9") | starts_with("Ntsr2"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized2 <- data_long2 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes2 <- c("Gja1","Gjb6" ,"Aqp4","Acsbg1"," Fxyd1",
"Slc6a11","Slc7a10","Fgfr3", "Sox9", "Ntsr2" )
# 绘制每个基因的图
plots2 <- lapply(genes2, function(gene) {
plot_gene(gene, data_summarized2)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots2, ncol = 3))# 提取感兴趣的基因
genes_of_interest3 <- c("Syt1" , "Synpr" , "Gad1" ,
"Ndrg4" , "Scg2" , "Camk2b",
"Grin2b", "Atp1a3","Snap25","Gad2")
counts_subset3 <- counts[genes_of_interest3, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long3 <- as.data.frame(t(counts_subset3))
counts_long3$Sample_ID <- rownames(counts_long3)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined3 <- merge(counts_long3, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long3 <- data_combined3 %>%
pivot_longer(cols = starts_with("Syt1") | starts_with("Synpr") | starts_with("Gad1") | starts_with("Ndrg4") | starts_with("Scg2") | starts_with("Camk2b") | starts_with("Grin2b") | starts_with("Atp1a3") | starts_with("Snap25") | starts_with("Gad2"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized3 <- data_long3 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes3 <- c("Syt1" , "Synpr" , "Gad1" ,
"Ndrg4" , "Scg2" , "Camk2b",
"Grin2b", "Atp1a3","Snap25","Gad2")
# 绘制每个基因的图
plots3 <- lapply(genes3, function(gene) {
plot_gene(gene, data_summarized3)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots3, ncol = 3))# 提取感兴趣的基因
genes_of_interest4 <- c("Ly6c1" , "Ly6a" , "Cldn5" , "Flt1" ,
"Pglyrp1" ,"Slco1a4" ,"Itm2a" , "Egfl7",
"Cxcl12" , "Hspb1" )
counts_subset4 <- counts[genes_of_interest4, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long4 <- as.data.frame(t(counts_subset4))
counts_long4$Sample_ID <- rownames(counts_long4)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined4 <- merge(counts_long4, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long4 <- data_combined4 %>%
pivot_longer(cols = starts_with("Ly6c1") | starts_with("Ly6a") | starts_with("Cldn5") | starts_with("Flt1") | starts_with("Pglyrp1") | starts_with("Slco1a4") | starts_with("Itm2a") | starts_with("Egfl7") | starts_with("Cxcl12") | starts_with("Hspb1"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized4 <- data_long4 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes4 <- c("Ly6c1" , "Ly6a" , "Cldn5" , "Flt1" ,
"Pglyrp1" ,"Slco1a4" ,"Itm2a" , "Egfl7",
"Cxcl12" , "Hspb1" )
# 绘制每个基因的图
plots4 <- lapply(genes4, function(gene) {
plot_gene(gene, data_summarized4)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots4, ncol = 3))# 提取感兴趣的基因
genes_of_interest5 <- c("C1qa" , "C1qc" , "Ctss" ,"C1qb" ,
"Tyrobp", "Trem2" , "Fcer1g", "Selplg" ,"P2ry12","Csf1r" )
counts_subset5 <- counts[genes_of_interest5, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long5 <- as.data.frame(t(counts_subset5))
counts_long5$Sample_ID <- rownames(counts_long5)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined5 <- merge(counts_long5, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long5 <- data_combined5 %>%
pivot_longer(cols = starts_with("C1qa") | starts_with("C1qc") | starts_with("Ctss") | starts_with("C1qb") | starts_with("Tyrobp") | starts_with("Trem2") | starts_with("Fcer1g") | starts_with("Selplg") | starts_with("P2ry12") | starts_with("Csf1r"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized5 <- data_long5 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes5 <- c("C1qa" , "C1qc" , "Ctss" ,"C1qb" ,
"Tyrobp", "Trem2" , "Fcer1g", "Selplg" ,"P2ry12","Csf1r" )
# 绘制每个基因的图
plots5 <- lapply(genes5, function(gene) {
plot_gene(gene, data_summarized5)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots5, ncol = 3))# 提取感兴趣的基因
genes_of_interest6 <- c("C1ql1" , "Tnr" , "Matn4" , "Pcdh15" ,
"Cacng4", "Cdo1" ,"Nxph1" , "Xylt1" , "Sulf2","Qpct")
counts_subset6 <- counts[genes_of_interest6, ]
# 转置 counts 数据框,使得基因为行,样本为列
counts_long6 <- as.data.frame(t(counts_subset6))
counts_long6$Sample_ID <- rownames(counts_long6)
# 将 counts_long 和 sample_info_age_sex 合并
data_combined6 <- merge(counts_long6, sample_info_age_sex, by.x = "Sample_ID", by.y = "row.names")
# 将数据从宽格式转换为长格式
data_long6 <- data_combined6 %>%
pivot_longer(cols = starts_with("C1ql1") | starts_with("Tnr") | starts_with("Matn4") | starts_with("Pcdh15") | starts_with("Cacng4") | starts_with("Cdo1") | starts_with("Nxph1") | starts_with("Xylt1") | starts_with("Sulf2") | starts_with("Qpct"),
names_to = "gene",
values_to = "count")
# 汇总数据
data_summarized6 <- data_long6 %>%
group_by(age, group, gene) %>%
summarise(total_count = sum(count, na.rm = TRUE), .groups = 'drop') %>%
mutate(log2_count = log2(total_count + 1)) # 添加1以避免log2(0)
# 创建一个基因列表
genes6 <- c("C1ql1" , "Tnr" , "Matn4" , "Pcdh15" ,
"Cacng4", "Cdo1" ,"Nxph1" , "Xylt1" , "Sulf2","Qpct")
# 绘制每个基因的图
plots6 <- lapply(genes6, function(gene) {
plot_gene(gene, data_summarized6)
})
# 保存或展示图形
library(gridExtra)
do.call(grid.arrange, c(plots6, ncol = 3))Finally we conduct the enrichment analysis by fgsea.
# 获取所有样本ID
subject_ids <- dimnames(result$p.ref)[[3]]
# 创建一个空列表来存储所有样本的数据
individual_data <- list()
# 遍历所有样本ID,提取相应的数据并存储到列表中
for (subject_id in subject_ids) {
# 找到当前样本ID在数组中的索引
subject_index <- which(dimnames(result$p.ref)[[3]] == subject_id)
# 提取当前样本ID的数据
individual_data[[subject_id]] <- result$p.ref[, , subject_index]
}
# 查看列表的结构,以确认数据已经正确提取
str(individual_data)## List of 17
## $ 111: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 112: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 113: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 114: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 115: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 211: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 212: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 213: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 214: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 215: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 216: num [1:7215, 1:6] 2070 8558 8556 13827 4822 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 101: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 102: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 103: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 201: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 202: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## $ 203: num [1:7215, 1:6] 230 0 1051 2992 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:7215] "Sox17" "Mrpl15" "Lypla1" "Tcea1" ...
## .. ..$ : chr [1:6] "CellType1" "CellType2" "CellType3" "CellType4" ...
## [1] "111" "112" "113" "114" "115"
## [1] 17
Here we plot the cell type proportion acquired by imply
across time.
101deconv_prop <- result$imply.prop
subjects_to_extract <- 101
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")102subjects_to_extract <- 102
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")103subjects_to_extract <- 103
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")201subjects_to_extract <- 201
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")202subjects_to_extract <- 202
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")203subjects_to_extract <- 203
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")111subjects_to_extract <- 111
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")112subjects_to_extract <- 112
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")113subjects_to_extract <- 113
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")211subjects_to_extract <- 211
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")212subjects_to_extract <- 212
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")213subjects_to_extract <- 213
# 从 sample_info_age 中提取符合 subject_id 的样本
sample_info_filtered <- sample_info_age[sample_info_age$subject_ID %in% subjects_to_extract, ]
# 获取这些样本的 ID
filtered_sample_ids <- rownames(sample_info_filtered)
# 从 deconv_prop 中提取符合样本 ID 的行
deconv_prop_filtered <- deconv_prop[filtered_sample_ids, ]
# 2. 将两个数据框合并
deconv_prop_filtered$age <- sample_info_filtered$age
deconv_prop_filtered$subject_ID <- sample_info_filtered$subject_ID
deconv_prop_long <- melt(deconv_prop_filtered, id.vars = c("age", "subject_ID"), variable.name = "cell_type", value.name = "cell_proportion")
# 3. 绘图
ggplot(deconv_prop_long, aes(x = age, y = cell_proportion, color = cell_type)) +
geom_line() +
geom_point() +
facet_wrap(~ subject_ID) + # 根据 subject_ID 创建子图
labs(title = "Cell Type Proportion over Age",
x = "Age (Month)",
y = "Cell Type Proportion") +
theme_minimal() +
theme(legend.position = "bottom")